R Markdown

library(dplyr)
library(readr)
data <- read_csv("CA_divorce.csv")
full <- read_csv("usa_00002.csv")
colnames(data) <- tolower(colnames(data))
colnames(full) <- tolower(colnames(full))
datacopy <- data
fullcopy <- full

rename factors, adding additional features

datacopy$year <- as.factor(datacopy$year)
datacopy$sex <- as.factor(datacopy$sex)
levels(datacopy$sex) <- c("male", "female")
datacopy$marrno <- as.factor(datacopy$marrno)
datacopy$marst <- as.factor(datacopy$marst)
datacopy$age_interval <- cut(datacopy$age, c(15, 19, 29, 39, 49, 59, 69, 79, 89, 99), labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s", "90s"), include.lowest = TRUE)
levels(datacopy$marst) <- c("Married_spouse_present", "Married_spouse_absent", "Separated", "Divorced", "Widowed", "Single")
divorce <- datacopy %>% filter(marst == "Divorced")
divorce$age_interval <- cut(divorce$age, c(15, 19, 29, 39, 49, 59, 69, 79, 89, 99), labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s", "90s"), include.lowest = TRUE)
divorce$age_interval <- as.factor(divorce$age_interval)

yrmarr not important because it is correlated with “age”

fullcopy$year <- as.factor(fullcopy$year)
fullcopy$sex <- as.factor(fullcopy$sex)
levels(fullcopy$sex) <- c("male", "female")
fullcopy$marrno <- as.factor(fullcopy$marrno)
fullcopy$marst <- as.factor(fullcopy$marst)
levels(fullcopy$marst) <- c("Married_spouse_present", "Married_spouse_absent", "Separated", "Divorced", "Widowed", "Single")
divorcef <- fullcopy %>% filter(marst == "Divorced")
divorcef$age_interval <- cut(divorcef$age, c(15, 19, 29, 39, 49, 59, 69, 79, 89, 99), labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s", "90s"), include.lowest = TRUE)
divorcef$age_interval <- as.factor(divorcef$age_interval)
memory.limit(2000000)
## [1] 2e+06
newdf <- fullcopy %>% filter(year == 2019, statefip == 6)
newdf$age_interval <- cut(newdf$age, c(15, 19, 29, 39, 49, 59, 69, 79, 89, 99), labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s", "90s"), include.lowest = TRUE)
newdf$age_interval <- as.factor(newdf$age_interval)
teen <- newdf %>% filter(age_interval == "10s")

States

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
memory.limit(240000)
## Warning in memory.limit(240000): cannot decrease memory limit: ignored
## [1] 2e+06
nation <- divorcef %>% group_by(statefip, year) %>% summarise(count = n())
## `summarise()` regrouping output by 'statefip' (override with `.groups` argument)
#merged <- merge(divorcef, nation, by = c("statefip", "year"))


nation$statefip <- as.factor(nation$statefip)
levels(nation$statefip) <- c("AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA", "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY")

fig <- plot_ly(type = "choropleth", locations = nation$statefip, locationmode = "USA-states", z = nation$count, scope = 'usa', frame = nation$year) %>% 
  layout(title = "2009-2019 US Divorce Count Trend", geo = list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)) %>% style(hoverlabel = list(bgcolor = 'white'))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: 'choropleth' objects don't have these attributes: 'scope'
## Valid attributes include:
## 'type', 'visible', 'legendgroup', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'locations', 'locationmode', 'z', 'geojson', 'featureidkey', 'text', 'hovertext', 'marker', 'selected', 'unselected', 'hoverinfo', 'hovertemplate', 'showlegend', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'geo', 'idssrc', 'customdatasrc', 'metasrc', 'locationssrc', 'zsrc', 'textsrc', 'hovertextsrc', 'hoverinfosrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
fig 
## Warning: 'choropleth' objects don't have these attributes: 'scope'
## Valid attributes include:
## 'type', 'visible', 'legendgroup', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'locations', 'locationmode', 'z', 'geojson', 'featureidkey', 'text', 'hovertext', 'marker', 'selected', 'unselected', 'hoverinfo', 'hovertemplate', 'showlegend', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'geo', 'idssrc', 'customdatasrc', 'metasrc', 'locationssrc', 'zsrc', 'textsrc', 'hovertextsrc', 'hoverinfosrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
Sys.setenv("plotly_username" = "hanaylim")
Sys.setenv("plotly_api_key" = "ERfN7JPfQfV9QJlVnj0H")
api_create(fig, "Divorce in US 2019")
## Warning: 'choropleth' objects don't have these attributes: 'scope'
## Valid attributes include:
## 'type', 'visible', 'legendgroup', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'locations', 'locationmode', 'z', 'geojson', 'featureidkey', 'text', 'hovertext', 'marker', 'selected', 'unselected', 'hoverinfo', 'hovertemplate', 'showlegend', 'zauto', 'zmin', 'zmax', 'zmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'geo', 'idssrc', 'customdatasrc', 'metasrc', 'locationssrc', 'zsrc', 'textsrc', 'hovertextsrc', 'hoverinfosrc', 'hovertemplatesrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Found a grid already named: 'Divorce in US 2019 Grid'. Since fileopt='overwrite', I'll try to update it
## Found a plot already named: 'Divorce in US 2019'. Since fileopt='overwrite', I'll try to update it
# https://plotly.com/~hanaylim/3/

Barely any changes, most divorce in California

California

Which age group(s) gets divorced the most?

Age groups of 50s and 60s get the most divorce. Females tend to request/get(?) divorce more than men do.

library(ggplot2)
#table(divorce$age)  #7486, 7487
ggplot(divorce, aes(x = age_interval, fill = sex)) + geom_bar() + 
  geom_text(stat = "count", aes(label = ..count..), position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("dodgerblue", "palevioletred")) + 
  ggtitle("Count of Divorce in California by Age Intervals") + 
  theme(plot.title = element_text(hjust = 0.5))

Three plots

library(wesanderson)
library(plotly)
# memory.limit(size=120000)
divorcef %>% filter(statefip == 6) %>% group_by(year) %>% summarise(count = n()) %>% ggplot(aes(x = year, y = count, group = 1)) + geom_point() + geom_line() +
  ggtitle("The divorce trend counts in California from 2009 - 2019")
## `summarise()` ungrouping output (override with `.groups` argument)

divorcef %>% filter(statefip == 6) %>% group_by(year, age_interval) %>% summarise(count = n()) %>% ggplot(aes(x = year, y = count, group = age_interval)) +
  geom_point(aes(color = age_interval)) + geom_line(aes(color = age_interval)) + 
  ggtitle("The divorce trend counts in California from 2009 - 2019 by age groups") 
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

divorcef %>% filter(statefip == 6, age_interval == "10s") %>% group_by(year) %>% summarise(count = n()) %>% 
  ggplot(aes(x = year, y = count, group = 1)) + geom_point() + geom_line() + 
  ggtitle("The divorce trend of the age 15 - 19 in California from 2009 - 2019")
## `summarise()` ungrouping output (override with `.groups` argument)

teendivorce <- divorcef %>% filter(statefip == 6, age_interval == "10s")
teendivorce$no_yr_marr <- as.numeric(as.character(teendivorce$year)) - teendivorce$yrmarr
teendivorce$no_yr_marr <- as.factor(teendivorce$no_yr_marr)

d <- teendivorce %>% group_by(no_yr_marr) %>% summarise(count = n()) %>% ungroup() %>% mutate(prop = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
d$no_yr_marr <- as.factor(d$no_yr_marr)
# ggplot(d, aes(x = "", y = prop, fill = no_yr_marr)) + geom_bar(stat = "identity", color = "white", width = 1) + 
# coord_polar("y") + theme_void() + 
# geom_text(aes(label = round(prop, 3)*100), position = position_stack(vjust = 0.5)) +
# scale_fill_manual("Duration of marriage", values = wes_palette("Zissou1", n = 5)) + 
# theme(axis.text = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank()) + 
# ggtitle("Marriage year duration for ages of 19 and under (%)")
  
pie <- plot_ly(d, labels = ~no_yr_marr, values = ~prop, type = 'pie',textposition = 'outside',textinfo = 'label+percent') %>%
  layout(title = '2019 Duration of Teen Marriage in California (age 15-19) (%)',
       xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
       yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)) %>% 
  layout(legend=list(title=list(text='<b> Duration of Marriage </b>')))
pie 
api_create(pie, "Duration of Teen marriage (age 15 - 19)")
## Found a grid already named: 'Duration of Teen marriage (age 15 - 19) Grid'. Since fileopt='overwrite', I'll try to update it
## Found a plot already named: 'Duration of Teen marriage (age 15 - 19)'. Since fileopt='overwrite', I'll try to update it
# https://chart-studio.plotly.com/~hanaylim/7/#/
  1. The divorce rates of the age groups 50s and higher show increasing trend - The divorce rate for elderly couples is growing these days - why?
  • subjective reason: their children are grown-ups
  1. Teen Marriage duration lasts very short (maximum 4 years, with 1 year being the most by 80%)

Teen marriage in 2019

teen %>% filter(marst != "Single") %>% group_by(marst) %>% summarise(count = n()) %>% mutate(prop = count / sum(count)) %>% 
  ggplot(aes(x = "", y = count, fill = marst)) + geom_bar(stat = "identity", width = 1, color = "white") + 
  coord_polar("y", start = 0) + theme_void() + 
  geom_text(aes(label = paste0(round(prop, 3)*100, "%")), position = position_stack(vjust = 0.5)) +
  ggtitle("2019 Age group of 15-19 Marital Status in California") 
## `summarise()` ungrouping output (override with `.groups` argument)

World Divorce data

library(readxl)
library(dplyr)

divorce <- read_excel("SF_3_1_Marriage_divorce_rates.xlsx", sheet = "DivorceRate", skip = 3)
divorce <- divorce[-c(52:nrow(divorce)), -c(2:match("1991", names(divorce)))]

values <- suppressWarnings(as.data.frame(lapply(divorce[, -1], as.numeric), check.names = FALSE))
divorce <- cbind("Country" = divorce[,1], mutate(values, across(where(is.numeric), round, 3)))
divorce
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag()     masks stats::lag()
test <- divorce %>% gather(year, rate, `1992`:`2017`)
a <- test[-which(is.na(test$rate)), ]

Overall Trend in Divorce Rates

library(ggplot2)

ggplot(data = a, aes(x = year, y = rate, fill = Country)) + geom_bar(stat = "identity") +
  coord_flip() + 
  ggtitle("Overall Trend in Divorce Rates") + 
  theme(plot.title = element_text(size = 20, hjust = 0.5), legend.text = element_text(size = 9), legend.key.size = unit(0.01, 'cm')) + geom_text(size = 2.5, aes(label = rate), position = position_stack(vjust = 0.5))

Top 10 Countries with the highest divorce rate in 1992

before <- divorce[, c(1, 2)]
before_divorce <- before[-which(is.na(before$`1992`)), ] %>% arrange(desc(`1992`))
before_divorce[1:10, ]
before_plot <- ggplot(before_divorce[1:10, ], aes(x = reorder(Country, `1992`), y = `1992`, fill = `1992`)) + 
  geom_bar(stat = "identity") + 
  coord_flip() + 
  xlab("Country") + ylab("Rates in 1992") +
  geom_text(aes(label = `1992`), hjust = -0.15, size = 5) + 
  ggtitle("Top 10 countries with the highest divorce rates in year 1992") + 
  theme(plot.title = element_text(size = 20, hjust = 0.5), 
        axis.text.x = element_text(size = 12), 
        axis.text.y = element_text(size = 12),
        legend.title = element_text(size = 13), 
        legend.text = element_text(size = 13), 
        legend.key.size = unit(1, 'cm')) + 
  scale_fill_continuous(name = "Divorce Rate")

ggplot(before_divorce[1:10, ], aes(x = reorder(Country, `1992`), y = `1992`, fill = `1992`)) + 
  geom_bar(stat = "identity") + 
  coord_flip() + 
  xlab("Country") + ylab("Rates in 1992") +
  geom_text(aes(label = `1992`), hjust = -0.15) + 
  ggtitle("Top 10 countries with the highest divorce rates in year 1992") + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  scale_fill_continuous(name = "Divorce Rate")

Top 10 Countries with the highest divorce rate in recent year (2017)

library(cowplot)
recent <- divorce[, c(1, 27)]
recent_divorce <- recent[-which(is.na(recent$`2017`)), ] %>% arrange(desc(`2017`))
recent_divorce[1:10, ]
recent_plot <- ggplot(recent_divorce[1:10, ], aes(x = reorder(Country, `2017`), y = `2017`, fill = `2017`)) + 
  geom_bar(stat = "identity") + 
  coord_flip() + 
  xlab("Country") + ylab("Rates in 2017") +
  geom_text(aes(label = `2017`), hjust = -0.15, size = 5) + 
  ggtitle("Top 10 countries with the highest divorce rates in year 2017") + 
  theme(plot.title = element_text(size = 20, hjust = 0.5), 
        axis.text.x = element_text(size = 12), 
        axis.text.y = element_text(size = 12),
        legend.text = element_text(size = 13), 
        legend.title = element_text(size = 13), 
        legend.key.size = unit(1, 'cm')) + 
  scale_fill_continuous(name = "Divorce Rate")

ggplot(recent_divorce[1:10, ], aes(x = reorder(Country, `2017`), y = `2017`, fill = `2017`)) + 
  geom_bar(stat = "identity") + 
  coord_flip() + 
  xlab("Country") + ylab("Rates in 2017") +
  geom_text(aes(label = `2017`), hjust = -0.15) + 
  ggtitle("Top 10 countries with the highest divorce rates in year 2017") + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  scale_fill_continuous(name = "Divorce Rate")

plot_grid(before_plot, recent_plot)

is divorce related to the presence of children?